* ------------------------------------------------------------------
* Project paths. $odir is the absolute project root; $raw, $temp and
* $result are sub-folders addressed relative to it once the working
* directory has been changed below.
* ------------------------------------------------------------------
global odir "/Users/sangdong/Documents/KoPDP_2020update/final"
global raw "raw"
global temp "temp"
global result "result"
* Quote the path so the script keeps working if the root ever contains spaces.
cd "$odir"
* Create both output folders up front. capture swallows the error mkdir
* raises when a folder already exists. Without the $result folder the
* final save of family_matching would fail after the whole pipeline ran.
capture mkdir "$temp"
capture mkdir "$result"

* Flag singleton families: appnum values that appear on exactly one row
* of raw/equivalents. Only those rows are kept, each carrying single = 1.
use $raw/equivalents, clear
bysort appnum: keep if _N == 1
generate single = 1
keep appnum single
save $temp/singleton, replace


* Attach the singleton flag and the family classification to
* equivalents_final, then reduce the data to one clean row per family member.
use $raw/equivalents_final, clear
drop _m
merge m:1 appnum using $temp/singleton
* Rows found only in the master are not singletons.
replace single = 0 if _merge == 1
drop _merge
merge m:1 epodoc using $raw/family_class_final, keep(master match)
// Logged merge result (it lists using-only rows, so it was presumably
// recorded on a run without the keep() option -- TODO confirm):
//     Result                           # of obs.
//     -----------------------------------------
//     not matched                       324,294
//         from master                        51  (_merge==1)
//         from using                    324,243  (_merge==2)
//
//     matched                         4,301,139  (_merge==3)
//     -----------------------------------------

// One-off consistency check, kept for reference:
// keep if _m==3
// keep appnum id
// duplicates drop
// duplicates report appnum
// // --------------------------------------
// //    copies | observations       surplus
// // ----------+---------------------------
// //         1 |      3059751             0
// // --------------------------------------
// // For _m==3 rows the family id is assigned consistently (one id per appnum),
// // so the _m==3 rows are used to infer the family id of the _m==1 rows.

* Within each appnum, id sorts with missing values last, so id[1] is a
* non-missing id whenever any row of that appnum has one.
bysort appnum (id): replace id = id[1] if _merge == 1
// (0 real changes made)
* NOTE(review): master-only rows whose id could not be inferred keep a missing
* id from here on; confirm they are removed by the family filter below.
drop _merge appnum epodoc
drop if family == .
duplicates drop
* NOTE(review): duplicates are tagged on family alone, across every ctry value
* present at this point -- confirm that is the intended scope.
duplicates tag family, generate(dup)
list if dup > 0
* Discard every family value that still occurs more than once.
keep if dup == 0
// (930 observations deleted)
drop dup
format id %10.0g

* Split the cleaned family list by target country into temp/KR and temp/US.
foreach c in KR US {
	preserve
	keep if ctry == "`c'"
	save $temp/`c', replace
	restore
}

* Attach filing dates, which feed the date-ordered family ids built later.
* KR: family holds the appnum, so look the date up in raw/numbers.
use $temp/KR, clear
rename family appnum
merge m:1 appnum using $raw/numbers, keep(master match) keepusing(appdate) nogenerate
rename appnum family
save $temp/KR_appdate, replace

* US: family holds the wku, so look the date (apd) up in raw/basic_7618,
* then rename it to match the KR file before stacking the two.
use $temp/US, clear
rename family wku
merge m:1 wku using $raw/basic_7618, keep(master match) keepusing(apd) nogenerate
rename wku family
rename apd appdate
append using $temp/KR_appdate
save $temp/appdate, replace /* important */

* Derive each family's earliest filing date, flag the first filing(s),
* and number the families in order of that date.
use $temp/appdate, clear
* Numeric missing values sort last, so appdate[1] is the earliest non-missing
* date in the family; it is missing only when every member lacks a date.
bysort id (appdate): gen double fappdate = appdate[1] /* earliest filing date per family */
* Several members can be flagged when they share the earliest date. Because
* missing == missing is true in Stata, every member of an all-missing family
* is flagged as well.
gen first = (appdate == fappdate)
	** Store the first-filing flag for each (id, family) pair
	preserve
	keep id family first
	drop if family==.
	duplicates drop
	save $temp/first, replace
	restore
keep if first == 1
keep id fappdate
duplicates drop
* fappdate is constant within id, so each id now appears exactly once.
* Sorting on fappdate alone leaves families that share a date in arbitrary
* order, so new_id would differ between runs; id is added as a tiebreaker
* to make the numbering reproducible.
sort fappdate id
* long rather than the default float: float cannot represent every integer
* above 16,777,216 exactly.
gen long new_id = _n
keep id new_id
save $temp/new_id, replace


* Assemble the final file: attach the date-ordered family id and the
* first-filing flag to every family member, then label and save.
use $temp/appdate, clear
// 	** Unfinished fix for families that carry more than one id (kept for reference)
// 	drop if family == 9853711 & id == 1719610
// 	drop if 

merge m:1 id using $temp/new_id, nogenerate
merge m:1 id family using $temp/first, nogenerate
keep new_id ctry family single first appdate
order new_id ctry family single first appdate 
* The new id replaces the original one, which the keep above discarded.
rename new_id id
* First filings come first within each family. ctry and family are added as
* tiebreakers so the row order of the saved file is the same on every run;
* sorting on id and first alone leaves tied rows in arbitrary order.
gsort id -first ctry family

label variable id "임의로 부여한 패밀리 ID"
label variable ctry "패밀리 출원의 출원 대상 국가"
label variable family "패밀리 출원의 appnum/wku"
label variable first "패밀리 내 최초출원 여부"
label variable appdate "family의 출원일"
label variable single "singleton family 여부 (전 세계 출원 고려)"

save $result/family_matching, replace

* Remove the temporary folder and every intermediate file in it
* (relies on a Unix-style rm being available to the shell).
!rm -r $temp
